Parameters that might affect performance

This notebook examines how parameters in the semantic model of the Danish language affect its performance.

  • Number of pages read
  • Use of stopwords
  • Exclusion of short pages
  • Scaling of matrix tfidf/count
  • Normalization of documents
  • Factorization of matrix

In [1]:
from everything import *
from dasem.semantic import Semantic
from dasem.data import wordsim353 as wordsim353_data

In [2]:
# Read the two evaluation datasets:
#  - four_words: outlier-detection task; columns word1..word4, where word4
#    holds the true outlier (see compute_accuracy below).
#  - wordsim353: word-pair similarity dataset with Danish translations (da1,
#    da2) and human ratings in the 'Human (mean)' column.
four_words = read_csv('../dasem/data/four_words.csv', encoding='utf-8')
wordsim353 = wordsim353_data()

In [3]:
def compute_accuracy(semantic, four_words):
    """Return the fraction of tasks where the model finds the true outlier.

    Parameters
    ----------
    semantic : object
        Model exposing a `sort_by_outlierness(words)` method that returns the
        words ordered most-outlier-first.
    four_words : pandas.DataFrame
        One row per task; the first four columns are the candidate words and
        the `word4` column holds the true outlier.

    Returns
    -------
    float
        Mean of the indicator "predicted outlier == word4".
    """
    predicted_outliers = [
        semantic.sort_by_outlierness(row.values[:4])[0]
        for _, row in four_words.iterrows()
    ]
    return mean(four_words.word4 == predicted_outliers)

In [4]:
def compute_correlation(semantic, wordsim):
    """Return the Pearson correlation between human scores and the model.

    Word pairs for which the model relatedness is NaN (e.g. words missing
    from the model) are dropped before computing the correlation.

    Parameters
    ----------
    semantic : object
        Model exposing `relatedness(words)` returning a symmetric matrix of
        pairwise relatedness values.
    wordsim : pandas.DataFrame
        Word pairs in columns `da1`/`da2` with ratings in 'Human (mean)'.

    Returns
    -------
    float
        Pearson correlation coefficient.
    """
    human_scores = []
    model_scores = []
    for _, row in wordsim.iterrows():
        relatedness_matrix = semantic.relatedness([row.da1, row.da2])
        model_scores.append(relatedness_matrix[0, 1])
        human_scores.append(row['Human (mean)'])
    human_scores = array(human_scores)
    model_scores = array(model_scores)
    # Keep only pairs where the model produced a defined relatedness.
    valid = ~isnan(model_scores)
    return corrcoef(human_scores[valid], model_scores[valid])[0, 1]

In [5]:
# Grid search over the Semantic model hyper-parameters, recording outlier
# accuracy and wordsim correlation for each combination.
#
# NOTE(review): this cell is expensive -- it fits one Semantic model per
# parameter combination (2 * 3 * 2 * 2 * 3 = 72 fits).
max_n_pagess = [3000, 30000, None]
norms = ['l1', 'l2', None]
stop_wordss = [None, set(nltk.corpus.stopwords.words('danish'))]
use_idfs = [True, False]
sublinear_tfs = [True, False]

columns = ['accuracy', 'correlation', 'stop_words', 'use_idf', 'norm', 'sublinear_tf', 'max_n_pages']

# Build one record per combination and construct the frame once at the end.
# This replaces the removed pandas `.ix` indexer and avoids cell-by-cell
# assignment of strings/bools into a float-typed frame.
records = []
for stop_words_index, stop_words in enumerate(stop_wordss):
    for norm in norms:
        for use_idf in use_idfs:
            for sublinear_tf in sublinear_tfs:
                for max_n_pages in max_n_pagess:
                    semantic = Semantic(stop_words=stop_words, norm=norm,
                                        use_idf=use_idf, sublinear_tf=sublinear_tf,
                                        max_n_pages=max_n_pages)
                    records.append({
                        'accuracy': compute_accuracy(semantic, four_words),
                        'correlation': compute_correlation(semantic, wordsim353),
                        # Store the index (0/1) instead of the stopword set
                        # itself so the column stays numeric.
                        'stop_words': stop_words_index,
                        'use_idf': use_idf,
                        'norm': str(norm),
                        'sublinear_tf': sublinear_tf,
                        'max_n_pages': max_n_pages,
                    })
results = DataFrame(records, columns=columns)

In [6]:
# Attach a model-relatedness column to wordsim353 for plotting below.
# NOTE(review): `semantic` here is whatever model was fitted LAST in the grid
# search above (the final loop iteration: stopwords enabled, norm=None,
# use_idf=False, sublinear_tf=False, max_n_pages=None).  Re-running cells out
# of order changes this result -- consider re-fitting an explicit model here.
relatednesses = []
for idx, row in wordsim353.iterrows():
    R = semantic.relatedness([row.da1, row.da2])
    relatednesses.append(R[0, 1])
wordsim353['relatedness'] = relatednesses

In [7]:
wordsim353


Out[7]:
Word 1 da1 Word 2 da2 Human (mean) Problem relatedness
0 love kærlighed sex sex 6.77 NaN 0.069031
1 tiger tiger cat kat 7.35 NaN 0.024325
2 tiger tiger tiger tiger 10.00 NaN 1.000000
3 book bog paper papir 7.46 NaN 0.031266
4 computer computer keyboard tastatur 7.62 NaN 0.117331
5 computer computer internet internet 7.58 NaN 0.059367
6 plane fly car bil 5.77 NaN 0.013637
7 train tog car bil 6.31 NaN 0.026891
8 telephone telefon communication kommunikation 7.50 NaN 0.007303
9 television tv radio radio 6.77 NaN 0.164519
10 media medie radio radio 7.42 NaN 0.032748
11 drug narkotika abuse misbrug 6.85 NaN 0.074244
12 bread brød butter smør 6.19 NaN 0.075498
13 cucumber agurk potato kartoffel 5.92 NaN 0.070229
14 doctor læge nurse sygeplejerske 7.00 NaN 0.078341
15 professor professor doctor læge 6.62 NaN 0.135296
16 student studerende professor professor 6.81 NaN 0.087950
17 smart klog student studerende 4.62 NaN 0.007902
18 smart klog stupid dum 5.81 NaN 0.033734
19 company firma stock aktie 7.08 NaN 0.023840
20 stock aktie market marked 8.08 NaN 0.036268
21 stock aktie phone telefon 1.62 NaN 0.012286
22 stock aktie CD CD 1.31 NaN 0.000482
23 stock aktie jaguar jaguar 0.92 NaN 0.001671
24 stock aktie egg æg 1.81 NaN 0.014525
25 fertility frugtbar egg æg 6.69 NaN 0.019678
27 stock aktie life liv 0.92 NaN 0.009396
28 book bog library bibliotek 7.46 NaN 0.064770
29 bank bank money penge 8.12 NaN 0.121945
30 wood træ forest skov 7.73 NaN 0.040285
... ... ... ... ... ... ... ...
322 gender køn equality lighed 6.41 NaN 0.066694
323 change ændring attitude holdning 5.44 NaN 0.086168
324 family familie planning planlægning 6.25 NaN 0.003667
325 opera opera industry industri 2.63 NaN 0.007351
326 sugar sukker approach tilgang 0.88 NaN 0.012265
327 practice øvelse institution institution 3.19 NaN 0.005296
328 ministry ministerium culture kultur 4.69 NaN 0.031118
329 problem problem challenge udfordring 6.75 NaN 0.055777
330 size størrelse prominence fremtrædende 5.31 NaN 0.067736
331 country land citizen borger 7.31 NaN 0.018616
332 planet planet people folk 5.75 NaN 0.025209
333 development udvikling issue spørgsmål 3.97 NaN 0.129536
334 experience oplevelse music musik 3.47 NaN 0.045611
335 music musik project projekt 3.63 NaN 0.047501
336 glass glas metal metal 5.56 NaN 0.012299
337 aluminum aluminium metal metal 7.83 NaN 0.019734
338 chance chance credibility troværdighed 3.88 NaN 0.032975
340 concert koncert virtuoso virtuos 6.81 NaN 0.012196
341 rock rock jazz jazz 7.59 NaN 0.080152
342 museum museum theater teater 7.19 NaN 0.043194
343 observation observation architecture arkitektur 4.38 NaN 0.003176
344 space rum world verden 6.53 NaN 0.083000
345 preservation bevarelse world verden 6.19 NaN 0.023978
346 admission adgang ticket billet 7.69 NaN 0.042346
347 shower byge thunderstorm tordenbyge 6.31 NaN 0.000000
348 shower byge flood oversvømmelse 6.03 NaN 0.010859
349 weather vejr forecast vejrudsigt 8.34 NaN 0.062763
350 disaster katastrofe area område 6.25 NaN 0.086862
351 governor guvernør office kontor 6.34 NaN 0.022742
352 architecture arkitektur century århundrede 3.78 NaN 0.093821

319 rows × 7 columns


In [12]:
# Scatter plot of human similarity ratings against model relatedness.
# NOTE(review): yscale/ylim/title/show are presumably pylab-style globals
# pulled in by `from everything import *` in the first cell -- confirm.
wordsim353.plot(x='Human (mean)', y='relatedness', kind='scatter')
yscale('log')
ylim(0.0001, 1)  # floor keeps zero/near-zero relatedness values on the log axis
title('Scatter plot of Wordsim353 data')
show()



In [9]:
results


Out[9]:
accuracy correlation stop_words use_idf norm sublinear_tf max_n_pages
0 0.36 0.274049 0.0 True l1 True 3000.0
1 0.56 0.210682 0.0 True l1 True 30000.0
2 0.72 0.135028 0.0 True l1 True NaN
3 0.34 0.292945 0.0 True l1 False 3000.0
4 0.60 0.216162 0.0 True l1 False 30000.0
5 0.74 0.137733 0.0 True l1 False NaN
6 0.38 0.279397 0.0 False l1 True 3000.0
7 0.56 0.214376 0.0 False l1 True 30000.0
8 0.72 0.138702 0.0 False l1 True NaN
9 0.36 0.292561 0.0 False l1 False 3000.0
10 0.58 0.216529 0.0 False l1 False 30000.0
11 0.72 0.140512 0.0 False l1 False NaN
12 0.38 0.351491 0.0 True l2 True 3000.0
13 0.64 0.328927 0.0 True l2 True 30000.0
14 0.84 0.261032 0.0 True l2 True NaN
15 0.38 0.331091 0.0 True l2 False 3000.0
16 0.58 0.321919 0.0 True l2 False 30000.0
17 0.78 0.254649 0.0 True l2 False NaN
18 0.38 0.350975 0.0 False l2 True 3000.0
19 0.58 0.334352 0.0 False l2 True 30000.0
20 0.78 0.268553 0.0 False l2 True NaN
21 0.40 0.324016 0.0 False l2 False 3000.0
22 0.60 0.299873 0.0 False l2 False 30000.0
23 0.78 0.242552 0.0 False l2 False NaN
24 0.40 0.334992 0.0 True None True 3000.0
25 0.58 0.378010 0.0 True None True 30000.0
26 0.76 0.365937 0.0 True None True NaN
27 0.44 0.311267 0.0 True None False 3000.0
28 0.56 0.353467 0.0 True None False 30000.0
29 0.72 0.328984 0.0 True None False NaN
... ... ... ... ... ... ... ...
42 0.38 0.279397 1.0 False l1 True 3000.0
43 0.56 0.214376 1.0 False l1 True 30000.0
44 0.72 0.138702 1.0 False l1 True NaN
45 0.36 0.292561 1.0 False l1 False 3000.0
46 0.58 0.216529 1.0 False l1 False 30000.0
47 0.72 0.140512 1.0 False l1 False NaN
48 0.38 0.351491 1.0 True l2 True 3000.0
49 0.64 0.328927 1.0 True l2 True 30000.0
50 0.84 0.261032 1.0 True l2 True NaN
51 0.38 0.331091 1.0 True l2 False 3000.0
52 0.58 0.321919 1.0 True l2 False 30000.0
53 0.78 0.254649 1.0 True l2 False NaN
54 0.38 0.350975 1.0 False l2 True 3000.0
55 0.58 0.334352 1.0 False l2 True 30000.0
56 0.78 0.268553 1.0 False l2 True NaN
57 0.40 0.324016 1.0 False l2 False 3000.0
58 0.60 0.299873 1.0 False l2 False 30000.0
59 0.78 0.242552 1.0 False l2 False NaN
60 0.40 0.334992 1.0 True None True 3000.0
61 0.58 0.378010 1.0 True None True 30000.0
62 0.76 0.365937 1.0 True None True NaN
63 0.44 0.311267 1.0 True None False 3000.0
64 0.56 0.353467 1.0 True None False 30000.0
65 0.72 0.328984 1.0 True None False NaN
66 0.38 0.334995 1.0 False None True 3000.0
67 0.60 0.377975 1.0 False None True 30000.0
68 0.74 0.365990 1.0 False None True NaN
69 0.42 0.311267 1.0 False None False 3000.0
70 0.56 0.353424 1.0 False None False 30000.0
71 0.72 0.328968 1.0 False None False NaN

72 rows × 7 columns


In [10]:
# Regress accuracy on the grid-search factors to see which parameters matter
# (per the output below: a Gaussian-family, identity-link GLM, i.e. OLS).
# NOTE(review): `smf` (presumably statsmodels.formula.api) comes from the
# star import in the first cell -- confirm.
formula = 'accuracy ~ stop_words + use_idf + norm + sublinear_tf + max_n_pages'
model = smf.glm(formula, data=results).fit()
model.summary()


Out[10]:
Generalized Linear Model Regression Results
Dep. Variable: accuracy No. Observations: 48
Model: GLM Df Residuals: 41
Model Family: Gaussian Df Model: 6
Link Function: identity Scale: 0.000536585365854
Method: IRLS Log-Likelihood: 116.40
Date: Tue, 11 Oct 2016 Deviance: 0.022000
Time: 01:13:12 Pearson chi2: 0.0220
No. Iterations: 2
coef std err z P>|z| [0.025 0.975]
Intercept 0.3713 0.009 40.567 0.000 0.353 0.389
use_idf[T.True] 0.0017 0.007 0.249 0.803 -0.011 0.015
norm[T.l1] -0.0250 0.008 -3.053 0.002 -0.041 -0.009
norm[T.l2] 1.509e-16 0.008 1.84e-14 1.000 -0.016 0.016
sublinear_tf[T.True] -0.0017 0.007 -0.249 0.803 -0.015 0.011
stop_words 3.816e-17 0.007 5.71e-15 1.000 -0.013 0.013
max_n_pages 7.346e-06 2.48e-07 29.660 0.000 6.86e-06 7.83e-06

In [ ]: